## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /var/folders/qw/2tnkb3b11dncn1d6lmqs7rh40000gn/T//RtmpuWuMJp/h2o_krishnaprasad_started_from_r.out
##     /var/folders/qw/2tnkb3b11dncn1d6lmqs7rh40000gn/T//RtmpuWuMJp/h2o_krishnaprasad_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: .. Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 seconds 854 milliseconds 
##     H2O cluster timezone:       America/Denver 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.28.0.2 
##     H2O cluster version age:    1 month and 6 days  
##     H2O cluster name:           H2O_started_from_R_krishnaprasad_tnc829 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.00 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.1 (2019-07-05)
# Read classification dataset from RDS
class.data <- readRDS("KDEN_Class_Data_New.RDS")

# Reorder data frame chronologically so the one-day lag below is meaningful
class.data <- class.data[order(class.data$DATE), ]

# One-day lag of precipitation.
# NOTE: the original used `lag(class.data$PRCP, k = 1)`. stats::lag() does
# NOT shift the values of a plain numeric vector (it only adjusts the
# time-series attribute), and dplyr::lag() takes `n`, not `k`. Build the
# shifted vector explicitly so the behavior is unambiguous.
class.data$PRCP_LAG_1 <- c(NA, head(class.data$PRCP, -1))

# class.data$TEMP_LAG_1 <- c(NA, head(class.data$TEMP, -1))

# Drop rows with any missing value (incl. the first row introduced by the lag)
class.data <- class.data[complete.cases(class.data), ]

# Remove the weather source's sentinel "missing" codes
# (999.9 = missing MXSPD, 99.99 = missing PRCP)
class.data <-
  class.data[!(class.data$MXSPD == 999.9 |
                 class.data$PRCP == 99.99 | class.data$PRCP_LAG_1 == 99.99), ]

# Encode categorical indicator columns as factors
class.data$FOG <- as.factor(class.data$FOG)
class.data$SNOW_ICE <- as.factor(class.data$SNOW_ICE)

# Binary target: STRIKE == 0 -> "NO", anything else -> "YES"
class.data$STRIKE <-
  as.factor(ifelse(class.data$STRIKE == 0, "NO", "YES"))
# One-hot-encode categorical features
ohe_feats <- c("MONTH")

# Create dummy-variable encoder for MONTH
dummies <- dummyVars(~ MONTH, data = class.data)

df.dummies <- as.data.frame(predict(dummies, newdata = class.data))

# Replace the raw MONTH column with its dummy columns
class.data <-
  cbind(class.data[, -c(which(colnames(class.data) %in% ohe_feats))], df.dummies)

# Hold out 2019 entirely as the validation set
valid.cl.data <-
  class.data[(class.data$YEAR == 2019), ]

# Keep 2008-2018 for training/testing (drop early years and the held-out 2019).
# Refer to the column directly inside filter() rather than via class.data$YEAR.
class.data <-
  class.data %>% filter(!(YEAR %in% c(1995:2007, 2019)))

# Drop identifier / already-consumed columns; MONTH.12 is the redundant dummy
class.data <- subset(class.data, select = -c(DATE, YEAR, SEASON, MXSPD, SNOW_ICE, MONTH.12, STRIKECOUNT))

valid.cl.data <- subset(valid.cl.data, select = -c(DATE, YEAR, SEASON, MXSPD, SNOW_ICE, MONTH.12, STRIKECOUNT))
# Create the training and test datasets
set.seed(100)

# STRIKE is already a factor (set during preprocessing above); the original
# re-applied as.factor() here, which is a no-op and was removed.

# Step 1: Get row numbers for the training data (70/30 stratified on STRIKE)
trainRowNumbers.cl <-
  createDataPartition(class.data$STRIKE, p = 0.70, list = FALSE)

# Step 2: Create the training dataset
train.data <- class.data[trainRowNumbers.cl, ]

# Step 3: Create the test dataset (everything not sampled into training)
test.data <- class.data[-trainRowNumbers.cl, ]
# Print a trained caret model's resampling summary and MLeval diagnostic
# plots, then predict on `data` (must contain a STRIKE column with levels
# NO/YES) and return the confusion matrix of those predictions.
validateAndPrintResult <- function(model, data) {
  # Summarise resampling results
  print(model)

  # Run MLeval on the trained model
  res <- evalm(model)

  # Inside a function, auto-printing is suppressed, so the plots must be
  # printed explicitly — the original bare `res$roc` etc. silently did
  # nothing.
  print(res$roc)  # ROC curve
  print(res$cc)   # calibration curve
  print(res$prg)  # precision-recall-gain curve

  # Predict on the supplied data set and show a preview
  predicted.resp <- predict(model, data)
  print(head(predicted.resp))

  # Confusion matrix vs. the true STRIKE labels; this is the return value
  caret::confusionMatrix(
    reference = as.factor(data$STRIKE),
    data = predicted.resp,
    mode = 'everything',
    positive = 'YES'
  )
}
# # 5 Fold cross validation with Probabilities
# tc <- trainControl(
#   method = "cv",
#   number = 5,
#   savePredictions = "final",
#   classProbs = TRUE,
#   verboseIter = TRUE,
#   summaryFunction = twoClassSummary
# )
# model.metrics.class <-
#   data.frame(
#     "Model_Name" = character(0),
#     "Data_Type" = character(0),
#     "AUC" = numeric(0),
#     "Accuracy" = numeric(0),
#     "Sensitivity" = numeric(0),
#     "Specificity" = numeric(0),
#     "Precision" = numeric(0),
#     "Recall" = numeric(0),
#     "Kappa" = numeric(0),
#     "Confusion_Matrix" = character(0),
#     stringsAsFactors = FALSE
#   )
# saveModelMetrics <-
#   function(model.predictions,
#            data,
#            model.name,
#            data.type) {
# 
#     # auc
#     
#     # accuracy
#     
#     # sensitivity
#     
#     # specificity
#     
#     # precision
#     
#     # recall
#     
#     # kappa
#     
#     # confusionMatrix
#     
#     
#     # combine all the above validation metrics
#     model.summary <-
#       c(model.name,
#         data.type,
#         auc,
#         accuracy,
#         sensitivity,
#         specificity,
#         precision,
#         recall,
#         kappa,
#         confusionMatrix)
#     
#     return(model.summary)
#   }
# X_train = sparse.model.matrix(as.formula(paste(
#   "STRIKE ~", paste(colnames(train.data[, -11]), sep = "", collapse = " +")
# )), data = train.data)
# 
# y_train <- as.factor(train.data[,11])
# 
# X_test = sparse.model.matrix(as.formula(paste(
#   "STRIKE ~", paste(colnames(test.data[, -11]), sep = "", collapse = " +")
# )), data = test.data)
# 
# y_test <- as.factor(test.data[,11])
# 
# X_val = sparse.model.matrix(as.formula(paste(
#   "STRIKE ~", paste(colnames(valid.cl.data[, -11]), sep = "", collapse = " +")
# )), data = valid.cl.data)
# 
# y_val <- as.factor(valid.cl.data[,11])
# 
# # ELASTIC NET WITH 0 < ALPHA < 1
# a <- seq(0.1, 0.9, 0.05)
# search <- foreach(i = a, .combine = rbind) %dopar% {
#   cv <-
#     cv.glmnet(
#       X_train,
#       y_train,
#       family = "binomial",
#       nfold = 10,
#       type.measure = "deviance",
#       parallel = TRUE,
#       alpha = i
#     )
#   data.frame(
#     cvm = cv$cvm[cv$lambda == cv$lambda.1se],
#     lambda.1se = cv$lambda.1se,
#     alpha = i
#   )
# }
# plot(search$lambda.1se)
# cv3 <- search[search$cvm == min(search$cvm), ]
# 
# 
# model.glmnet <-
#   glmnet(
#     X_train,
#     y_train,
#     family = "binomial",
#     lambda = cv3$lambda.1se,
#     alpha = cv3$alpha
#   )
# coef(model.glmnet)
# 
# model.glmnet
# 
# summary(model.glmnet)
# 
# preds <- predict(model.glmnet, X_test, type = "response")
# 
# # Calculate true positive rate and false positive rate on the prediction object
# perf <- performance(prediction(preds, y_test), 'tpr', 'fpr')
# roc.auc.glmnet <- performance(prediction(preds, y_test), "auc")
# plot(perf, main = paste("ROC - Elastic Net"," | ","AUC - ", roc.auc.glmnet@y.values), colorize = TRUE) # plot ROC curve
# lines(c(0, 1), c(0, 1), col = "gray", lty = 4)
# 
# 
# predicted <- predict(model.glmnet, X_val, type = "response")
# 
# # Calculate true positive rate and false positive rate on the prediction object
# perf <- performance(prediction(predicted, y_val), 'tpr', 'fpr')
# roc.auc.glmnet <- performance(prediction(predicted, y_val), "auc")
# plot(perf, main = paste("ROC - Elastic Net"," | ","AUC - ", roc.auc.glmnet@y.values), colorize = TRUE) # plot ROC curve
# lines(c(0, 1), c(0, 1), col = "gray", lty = 4)
# 
# 
# 
# optCutOff <- optimalCutoff(y_val, predicted, optimiseFor = "Both", returnDiagnostics = T)
# optCutOff$
# 
# 
# #Misclassify Errors - needs to be low
# misClassError(y_val, predicted, threshold = optCutOff$optimalCutoff)
# 
# # Concordance - needs to be high
# Concordance(y_val, predicted)
# 
# sensitivity(y_val, predicted, threshold = optCutOff)
# 
# specificity(y_val, predicted, threshold = optCutOff)
# 
# # Confusion Matrix
# test <- confusionMatrix(y_val, predicted, threshold = optCutOff)
# 
# ks_plot(y_val, predicted[1])
# 
# 
# predicted.class <- predict(model.glmnet, X_val, type = "class")
# 
# # Output dataframe with probabilities
# output.data <- cbind(valid.cl.data, predicted)
# output.data <- cbind(output.data, predicted.class)
# Resampling settings shared by every base learner trained below.
# NOTE(review): supplying `index = createResample(...)` makes caret use
# 7 bootstrap resamples, which overrides method = "cv" / number = 7 — that
# matches the printed "Summary of sample sizes" equal to the full training
# set. Use createFolds() instead if genuine 7-fold CV is intended; confirm
# with the author.
trControl <- trainControl(
  method          = "cv",
  number          = 7,
  savePredictions = "final",
  index           = createResample(as.factor(train.data$STRIKE), 7),
  classProbs      = TRUE,            # class probabilities needed for ROC
  summaryFunction = twoClassSummary, # report ROC / Sens / Spec
  allowParallel   = TRUE
)


 
# col_sample_rate
# <chr>
# learn_rate
# <chr>
# max_depth
# <chr>
# ntrees
# <chr>
# sample_rate
# <chr>
# model_ids
# <chr>
# auc
# <chr>
# 0.2509    0.0072  7   203 0.7626  grid_binomial_xgb_55_model_6    0.7270077693205641

# Fixed XGBoost hyper-parameter grid: only max_depth (2..8) is searched;
# every other tuning parameter is held at a single value.
xgbTreeGrid <- expand.grid(
  nrounds          = 500,
  max_depth        = seq(2, 8, by = 1),
  eta              = 0.1,
  gamma            = 0,
  colsample_bytree = 1.0,
  subsample        = 1.0,
  min_child_weight = 4
)

# Single elastic-net configuration; caret accepts the leading "." in
# tuning-grid column names.
glmnetGridElastic <- expand.grid(.alpha = 0.3, .lambda = 0.009)


# col_sample_rate
# <chr>
# learn_rate
# <chr>
# max_depth
# <chr>
# ntrees
# <chr>
# sample_rate
# <chr>
# model_ids
# <chr>
# auc
# <chr>
#   0.4087  0.1532  1   385 0.9297
# GBM search grid: 3 depths x 3 shrinkages x 3 min-node sizes = 27
# candidates, all fit with 400 trees.
gbm.tune.grid <- expand.grid(
  .n.trees           = c(400),
  .interaction.depth = c(1, 3, 5),
  .shrinkage         = c(.01, .1, .3),
  .n.minobsinnode    = c(5, 10, 15)
)


set.seed(333)

# Train the four base learners under the shared trControl resampling scheme
# so their out-of-fold predictions are comparable (and ensemble-able).
modelList <- caretList(

  STRIKE ~ .,

  data      = train.data,

  trControl = trControl,

  metric    = "ROC",

  verbose   = TRUE,

  tuneList  = list(

    ## Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.

    xgbTree = caretModelSpec(method = "xgbTree", tuneGrid = xgbTreeGrid, nthread = 8),

    glmnet = caretModelSpec(method = "glmnet", tuneGrid = glmnetGridElastic),  ## Elastic, highly correlated with lasso and ridge regressions

    ## rf — the original also passed tuneLength = 20, but caret ignores
    ## tuneLength whenever an explicit tuneGrid is supplied, so it was dropped.
    rf = caretModelSpec(method = "rf", ntree = 2000, tuneGrid = data.frame(mtry = 10)),

    gbm = caretModelSpec(method = "gbm", tuneGrid = gbm.tune.grid)

  )
)
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.3807             nan     0.0100    0.0016
##      2        1.3775             nan     0.0100    0.0016
##      3        1.3743             nan     0.0100    0.0015
##      4        1.3712             nan     0.0100    0.0015
##      5        1.3680             nan     0.0100    0.0014
##      6        1.3650             nan     0.0100    0.0014
##      7        1.3623             nan     0.0100    0.0013
##      8        1.3592             nan     0.0100    0.0014
##      9        1.3562             nan     0.0100    0.0014
##     10        1.3536             nan     0.0100    0.0012
##     20        1.3280             nan     0.0100    0.0011
##     40        1.2882             nan     0.0100    0.0008
##     60        1.2591             nan     0.0100    0.0005
##     80        1.2375             nan     0.0100    0.0005
##    100        1.2207             nan     0.0100    0.0003
##    120        1.2081             nan     0.0100    0.0002
##    140        1.1977             nan     0.0100    0.0001
##    160        1.1894             nan     0.0100    0.0000
##    180        1.1823             nan     0.0100    0.0001
##    200        1.1769             nan     0.0100    0.0001
##    220        1.1717             nan     0.0100    0.0001
##    240        1.1670             nan     0.0100    0.0001
##    260        1.1636             nan     0.0100   -0.0000
##    280        1.1601             nan     0.0100   -0.0000
##    300        1.1567             nan     0.0100   -0.0000
##    320        1.1539             nan     0.0100   -0.0000
##    340        1.1511             nan     0.0100   -0.0000
##    360        1.1485             nan     0.0100   -0.0000
##    380        1.1461             nan     0.0100   -0.0001
##    400        1.1436             nan     0.0100   -0.0000
# gbm.pred <- predict(modelList$gbm, newdata = test.data, type = 'raw')
# 
# ## run MLeval
# res <- evalm(modelList$gbm)
#   
# 
# ## get ROC
# 
# res$roc
# 
# ## get calibration curve
# 
# res$cc
# 
# ## get precision recall gain curve
# 
# res$prg
# 
# caret::confusionMatrix(
#   reference = as.factor(test.data$STRIKE),
#   data = gbm.pred,
#   mode = 'everything',
#   positive = 'YES'
# )

# Evaluate the tuned XGBoost model on the held-out test split
validateAndPrintResult(modelList$xgbTree, test.data)
## eXtreme Gradient Boosting 
## 
## 2813 samples
##   20 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ... 
## Resampling results across tuning parameters:
## 
##   max_depth  ROC        Sens       Spec     
##   2          0.7258595  0.6501899  0.6938797
##   3          0.7088709  0.6424089  0.6845110
##   4          0.7005036  0.6267529  0.6831272
##   5          0.6967093  0.6232290  0.6722892
##   6          0.6951912  0.6188989  0.6751165
##   7          0.6927515  0.6205358  0.6723515
##   8          0.6909788  0.6225654  0.6689167
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
##  parameter 'min_child_weight' was held constant at a value of 4
## 
## Tuning parameter 'subsample' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
##  = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 4 and subsample = 1.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.353792721084333
## Group 1 AUC-ROC = 0.73

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  387 230
##        YES 186 401
##                                           
##                Accuracy : 0.6545          
##                  95% CI : (0.6269, 0.6814)
##     No Information Rate : 0.5241          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.3098          
##                                           
##  Mcnemar's Test P-Value : 0.03501         
##                                           
##             Sensitivity : 0.6355          
##             Specificity : 0.6754          
##          Pos Pred Value : 0.6831          
##          Neg Pred Value : 0.6272          
##               Precision : 0.6831          
##                  Recall : 0.6355          
##                      F1 : 0.6585          
##              Prevalence : 0.5241          
##          Detection Rate : 0.3331          
##    Detection Prevalence : 0.4875          
##       Balanced Accuracy : 0.6554          
##                                           
##        'Positive' Class : YES             
## 
# Evaluate the tuned XGBoost model on the 2019 validation set
validateAndPrintResult(modelList$xgbTree, valid.cl.data)
## eXtreme Gradient Boosting 
## 
## 2813 samples
##   20 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ... 
## Resampling results across tuning parameters:
## 
##   max_depth  ROC        Sens       Spec     
##   2          0.7258595  0.6501899  0.6938797
##   3          0.7088709  0.6424089  0.6845110
##   4          0.7005036  0.6267529  0.6831272
##   5          0.6967093  0.6232290  0.6722892
##   6          0.6951912  0.6188989  0.6751165
##   7          0.6927515  0.6205358  0.6723515
##   8          0.6909788  0.6225654  0.6689167
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
##  parameter 'min_child_weight' was held constant at a value of 4
## 
## Tuning parameter 'subsample' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
##  = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 4 and subsample = 1.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.353792721084333
## Group 1 AUC-ROC = 0.73

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO   73  32
##        YES  83 115
##                                           
##                Accuracy : 0.6205          
##                  95% CI : (0.5632, 0.6753)
##     No Information Rate : 0.5149          
##     P-Value [Acc > NIR] : 0.0001362       
##                                           
##                   Kappa : 0.2478          
##                                           
##  Mcnemar's Test P-Value : 3.124e-06       
##                                           
##             Sensitivity : 0.7823          
##             Specificity : 0.4679          
##          Pos Pred Value : 0.5808          
##          Neg Pred Value : 0.6952          
##               Precision : 0.5808          
##                  Recall : 0.7823          
##                      F1 : 0.6667          
##              Prevalence : 0.4851          
##          Detection Rate : 0.3795          
##    Detection Prevalence : 0.6535          
##       Balanced Accuracy : 0.6251          
##                                           
##        'Positive' Class : YES             
## 
# Evaluate the elastic-net (glmnet) model on the held-out test split
validateAndPrintResult(modelList$glmnet, test.data)
## glmnet 
## 
## 2813 samples
##   20 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7445988  0.7169431  0.6709229
## 
## Tuning parameter 'alpha' was held constant at a value of 0.3
## Tuning
##  parameter 'lambda' was held constant at a value of 0.009
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.39212299981179
## Group 1 AUC-ROC = 0.74

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  405 231
##        YES 168 400
##                                           
##                Accuracy : 0.6686          
##                  95% CI : (0.6412, 0.6952)
##     No Information Rate : 0.5241          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.339           
##                                           
##  Mcnemar's Test P-Value : 0.00191         
##                                           
##             Sensitivity : 0.6339          
##             Specificity : 0.7068          
##          Pos Pred Value : 0.7042          
##          Neg Pred Value : 0.6368          
##               Precision : 0.7042          
##                  Recall : 0.6339          
##                      F1 : 0.6672          
##              Prevalence : 0.5241          
##          Detection Rate : 0.3322          
##    Detection Prevalence : 0.4718          
##       Balanced Accuracy : 0.6704          
##                                           
##        'Positive' Class : YES             
## 
# Evaluate the elastic-net (glmnet) model on the 2019 validation set
validateAndPrintResult(modelList$glmnet, valid.cl.data)
## glmnet 
## 
## 2813 samples
##   20 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7445988  0.7169431  0.6709229
## 
## Tuning parameter 'alpha' was held constant at a value of 0.3
## Tuning
##  parameter 'lambda' was held constant at a value of 0.009
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.39212299981179
## Group 1 AUC-ROC = 0.74

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO   88  33
##        YES  68 114
##                                           
##                Accuracy : 0.6667          
##                  95% CI : (0.6105, 0.7195)
##     No Information Rate : 0.5149          
##     P-Value [Acc > NIR] : 6.278e-08       
##                                           
##                   Kappa : 0.3373          
##                                           
##  Mcnemar's Test P-Value : 0.0007167       
##                                           
##             Sensitivity : 0.7755          
##             Specificity : 0.5641          
##          Pos Pred Value : 0.6264          
##          Neg Pred Value : 0.7273          
##               Precision : 0.6264          
##                  Recall : 0.7755          
##                      F1 : 0.6930          
##              Prevalence : 0.4851          
##          Detection Rate : 0.3762          
##    Detection Prevalence : 0.6007          
##       Balanced Accuracy : 0.6698          
##                                           
##        'Positive' Class : YES             
## 
# Evaluate the random-forest model on the held-out test split
validateAndPrintResult(modelList$rf, test.data)
## Random Forest 
## 
## 2813 samples
##   20 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7181348  0.6595506  0.6801559
## 
## Tuning parameter 'mtry' was held constant at a value of 10
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.339792012708135
## Group 1 AUC-ROC = 0.72

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  391 242
##        YES 182 389
##                                           
##                Accuracy : 0.6478          
##                  95% CI : (0.6201, 0.6748)
##     No Information Rate : 0.5241          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2974          
##                                           
##  Mcnemar's Test P-Value : 0.004166        
##                                           
##             Sensitivity : 0.6165          
##             Specificity : 0.6824          
##          Pos Pred Value : 0.6813          
##          Neg Pred Value : 0.6177          
##               Precision : 0.6813          
##                  Recall : 0.6165          
##                      F1 : 0.6473          
##              Prevalence : 0.5241          
##          Detection Rate : 0.3231          
##    Detection Prevalence : 0.4743          
##       Balanced Accuracy : 0.6494          
##                                           
##        'Positive' Class : YES             
## 
# Evaluate the random-forest model on the 2019 validation set
validateAndPrintResult(modelList$rf, valid.cl.data)
## Random Forest 
## 
## 2813 samples
##   20 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7181348  0.6595506  0.6801559
## 
## Tuning parameter 'mtry' was held constant at a value of 10
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.339792012708135
## Group 1 AUC-ROC = 0.72

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO   84  31
##        YES  72 116
##                                           
##                Accuracy : 0.6601          
##                  95% CI : (0.6037, 0.7133)
##     No Information Rate : 0.5149          
##     P-Value [Acc > NIR] : 2.225e-07       
##                                           
##                   Kappa : 0.325           
##                                           
##  Mcnemar's Test P-Value : 8.104e-05       
##                                           
##             Sensitivity : 0.7891          
##             Specificity : 0.5385          
##          Pos Pred Value : 0.6170          
##          Neg Pred Value : 0.7304          
##               Precision : 0.6170          
##                  Recall : 0.7891          
##                      F1 : 0.6925          
##              Prevalence : 0.4851          
##          Detection Rate : 0.3828          
##    Detection Prevalence : 0.6205          
##       Balanced Accuracy : 0.6638          
##                                           
##        'Positive' Class : YES             
## 
# Evaluate the GBM model on the held-out test split
validateAndPrintResult(modelList$gbm, test.data)
## Stochastic Gradient Boosting 
## 
## 2813 samples
##   20 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.minobsinnode  ROC        Sens       Spec     
##   0.01       1                   5              0.7405019  0.6775723  0.6978071
##   0.01       1                  10              0.7398820  0.6778053  0.6981943
##   0.01       1                  15              0.7400998  0.6780858  0.6987150
##   0.01       3                   5              0.7500151  0.6730403  0.7134005
##   0.01       3                  10              0.7498429  0.6749600  0.7107830
##   0.01       3                  15              0.7494104  0.6757906  0.7118014
##   0.01       5                   5              0.7482403  0.6713271  0.7096891
##   0.01       5                  10              0.7486500  0.6716447  0.7132164
##   0.01       5                  15              0.7484035  0.6727652  0.7145545
##   0.10       1                   5              0.7371101  0.6614214  0.7009731
##   0.10       1                  10              0.7358510  0.6619505  0.6999148
##   0.10       1                  15              0.7362413  0.6660909  0.7014915
##   0.10       3                   5              0.7183773  0.6384029  0.6898557
##   0.10       3                  10              0.7191055  0.6418065  0.6860941
##   0.10       3                  15              0.7187297  0.6404175  0.6903453
##   0.10       5                   5              0.7113287  0.6346190  0.6784474
##   0.10       5                  10              0.7054848  0.6252153  0.6850435
##   0.10       5                  15              0.7071891  0.6257201  0.6819229
##   0.30       1                   5              0.7157828  0.6374494  0.6890194
##   0.30       1                  10              0.7167345  0.6424918  0.6912493
##   0.30       1                  15              0.7201906  0.6451942  0.6954336
##   0.30       3                   5              0.6845439  0.6067691  0.6763076
##   0.30       3                  10              0.6797716  0.6064654  0.6597695
##   0.30       3                  15              0.6802359  0.5973145  0.6708190
##   0.30       5                   5              0.6748662  0.5967975  0.6578960
##   0.30       5                  10              0.6719867  0.6032923  0.6568789
##   0.30       5                  15              0.6757455  0.6042790  0.6569964
## 
## Tuning parameter 'n.trees' was held constant at a value of 400
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 400, interaction.depth =
##  3, shrinkage = 0.01 and n.minobsinnode = 5.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.400352010588488
## Group 1 AUC-ROC = 0.75

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  387 217
##        YES 186 414
##                                           
##                Accuracy : 0.6653          
##                  95% CI : (0.6378, 0.6919)
##     No Information Rate : 0.5241          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3307          
##                                           
##  Mcnemar's Test P-Value : 0.1351          
##                                           
##             Sensitivity : 0.6561          
##             Specificity : 0.6754          
##          Pos Pred Value : 0.6900          
##          Neg Pred Value : 0.6407          
##               Precision : 0.6900          
##                  Recall : 0.6561          
##                      F1 : 0.6726          
##              Prevalence : 0.5241          
##          Detection Rate : 0.3439          
##    Detection Prevalence : 0.4983          
##       Balanced Accuracy : 0.6657          
##                                           
##        'Positive' Class : YES             
## 
# Evaluate the GBM model on the 2019 validation set
validateAndPrintResult(modelList$gbm, valid.cl.data)
## Stochastic Gradient Boosting 
## 
## 2813 samples
##   20 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.minobsinnode  ROC        Sens       Spec     
##   0.01       1                   5              0.7405019  0.6775723  0.6978071
##   0.01       1                  10              0.7398820  0.6778053  0.6981943
##   0.01       1                  15              0.7400998  0.6780858  0.6987150
##   0.01       3                   5              0.7500151  0.6730403  0.7134005
##   0.01       3                  10              0.7498429  0.6749600  0.7107830
##   0.01       3                  15              0.7494104  0.6757906  0.7118014
##   0.01       5                   5              0.7482403  0.6713271  0.7096891
##   0.01       5                  10              0.7486500  0.6716447  0.7132164
##   0.01       5                  15              0.7484035  0.6727652  0.7145545
##   0.10       1                   5              0.7371101  0.6614214  0.7009731
##   0.10       1                  10              0.7358510  0.6619505  0.6999148
##   0.10       1                  15              0.7362413  0.6660909  0.7014915
##   0.10       3                   5              0.7183773  0.6384029  0.6898557
##   0.10       3                  10              0.7191055  0.6418065  0.6860941
##   0.10       3                  15              0.7187297  0.6404175  0.6903453
##   0.10       5                   5              0.7113287  0.6346190  0.6784474
##   0.10       5                  10              0.7054848  0.6252153  0.6850435
##   0.10       5                  15              0.7071891  0.6257201  0.6819229
##   0.30       1                   5              0.7157828  0.6374494  0.6890194
##   0.30       1                  10              0.7167345  0.6424918  0.6912493
##   0.30       1                  15              0.7201906  0.6451942  0.6954336
##   0.30       3                   5              0.6845439  0.6067691  0.6763076
##   0.30       3                  10              0.6797716  0.6064654  0.6597695
##   0.30       3                  15              0.6802359  0.5973145  0.6708190
##   0.30       5                   5              0.6748662  0.5967975  0.6578960
##   0.30       5                  10              0.6719867  0.6032923  0.6568789
##   0.30       5                  15              0.6757455  0.6042790  0.6569964
## 
## Tuning parameter 'n.trees' was held constant at a value of 400
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 400, interaction.depth =
##  3, shrinkage = 0.01 and n.minobsinnode = 5.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 7255
## Number of groups: 1
## Observations per group: 7255
## Positive: YES
## Negative: NO
## Group: Group 1
## Positive: 3784
## Negative: 3471
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.400352010588488
## Group 1 AUC-ROC = 0.75

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO   86  29
##        YES  70 118
##                                           
##                Accuracy : 0.6733          
##                  95% CI : (0.6173, 0.7258)
##     No Information Rate : 0.5149          
##     P-Value [Acc > NIR] : 1.672e-08       
##                                           
##                   Kappa : 0.3512          
##                                           
##  Mcnemar's Test P-Value : 5.816e-05       
##                                           
##             Sensitivity : 0.8027          
##             Specificity : 0.5513          
##          Pos Pred Value : 0.6277          
##          Neg Pred Value : 0.7478          
##               Precision : 0.6277          
##                  Recall : 0.8027          
##                      F1 : 0.7045          
##              Prevalence : 0.4851          
##          Detection Rate : 0.3894          
##    Detection Prevalence : 0.6205          
##       Balanced Accuracy : 0.6770          
##                                           
##        'Positive' Class : YES             
## 
set.seed(333)

# Linear (GLM) blend of the three tree-based base learners.
# Fix: metric = "ROC" was requested, but the original trainControl lacked
# summaryFunction = twoClassSummary, so caret warned
# 'The metric "ROC" was not in the result set. Accuracy will be used instead.'
# Adding twoClassSummary (plus savePredictions for the meta-learner)
# makes ROC/Sens/Spec available so the ensemble is actually tuned on ROC.
greedyEnsemble <- caretEnsemble(
  c(modelList$rf, modelList$xgbTree, modelList$gbm),
  metric = "ROC",
  trControl = trainControl(
    method = "cv",
    number = 7,
    classProbs = TRUE,                 # required for ROC-based summaries
    summaryFunction = twoClassSummary, # exposes ROC/Sens/Spec to the meta-model
    savePredictions = "final",
    verboseIter = TRUE
  )
)
## Warning in train.default(predobs$preds, predobs$obs, ...): The metric "ROC" was
## not in the result set. Accuracy will be used instead.
## Aggregating results
## Fitting final model on full training set
# Show the ensemble summary (explicit print instead of auto-printing).
print(greedyEnsemble)
## A glm ensemble of 3 base models: rf1, xgbTree2, gbm3
## 
## Ensemble results:
## Generalized Linear Model 
## 
## 7255 samples
##    3 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 6218, 6218, 6219, 6219, 6219, 6218, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.6941428  0.3870312
# Class predictions from the greedy ensemble on the hold-out test set.
test.pred <- predict(greedyEnsemble, newdata = test.data)

# Confusion matrix with "YES" (strike) treated as the positive class;
# mode = "everything" adds precision/recall/F1 to the standard output.
caret::confusionMatrix(
  data = test.pred,
  reference = as.factor(test.data$STRIKE),
  positive = "YES",
  mode = "everything"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  392 218
##        YES 181 413
##                                           
##                Accuracy : 0.6686          
##                  95% CI : (0.6412, 0.6952)
##     No Information Rate : 0.5241          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.3376          
##                                           
##  Mcnemar's Test P-Value : 0.07151         
##                                           
##             Sensitivity : 0.6545          
##             Specificity : 0.6841          
##          Pos Pred Value : 0.6953          
##          Neg Pred Value : 0.6426          
##               Precision : 0.6953          
##                  Recall : 0.6545          
##                      F1 : 0.6743          
##              Prevalence : 0.5241          
##          Detection Rate : 0.3430          
##    Detection Prevalence : 0.4934          
##       Balanced Accuracy : 0.6693          
##                                           
##        'Positive' Class : YES             
## 
# Class predictions from the greedy ensemble on the validation set.
valid.pred <- predict(greedyEnsemble, newdata = valid.cl.data)

# Confusion matrix with "YES" (strike) treated as the positive class;
# mode = "everything" adds precision/recall/F1 to the standard output.
caret::confusionMatrix(
  data = valid.pred,
  reference = as.factor(valid.cl.data$STRIKE),
  positive = "YES",
  mode = "everything"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO   87  29
##        YES  69 118
##                                           
##                Accuracy : 0.6766          
##                  95% CI : (0.6207, 0.7289)
##     No Information Rate : 0.5149          
##     P-Value [Acc > NIR] : 8.446e-09       
##                                           
##                   Kappa : 0.3576          
##                                           
##  Mcnemar's Test P-Value : 8.162e-05       
##                                           
##             Sensitivity : 0.8027          
##             Specificity : 0.5577          
##          Pos Pred Value : 0.6310          
##          Neg Pred Value : 0.7500          
##               Precision : 0.6310          
##                  Recall : 0.8027          
##                      F1 : 0.7066          
##              Prevalence : 0.4851          
##          Detection Rate : 0.3894          
##    Detection Prevalence : 0.6172          
##       Balanced Accuracy : 0.6802          
##                                           
##        'Positive' Class : YES             
## 
# Confusion Matrix and Statistics
# 
#           Reference
# Prediction  NO YES
#        NO  703 297
#        YES 235 497
#                                           
#                Accuracy : 0.6928          
#                  95% CI : (0.6705, 0.7145)
#     No Information Rate : 0.5416          
#     P-Value [Acc > NIR] : < 2.2e-16       
#                                           
#                   Kappa : 0.3777          
#                                           
#  Mcnemar's Test P-Value : 0.008177        
#                                           
#             Sensitivity : 0.6259          
#             Specificity : 0.7495          
#          Pos Pred Value : 0.6790          
#          Neg Pred Value : 0.7030          
#               Precision : 0.6790          
#                  Recall : 0.6259          
#                      F1 : 0.6514          
#              Prevalence : 0.4584          
#          Detection Rate : 0.2870          
#    Detection Prevalence : 0.4226          
#       Balanced Accuracy : 0.6877          
#                                           
#        'Positive' Class : YES             
#                                    
# Stack all four base learners (xgbTree, glmnet, rf, gbm) with a GLM
# meta-model, reusing the shared resampling control object.
# Fix: use `<-` for assignment (the rest of this script uses `<-`;
# `=` at top level is non-idiomatic R and inconsistent here).
stack <- caretStack(modelList, method = "glm", trControl = trControl)
## Warning in train.default(predobs$preds, predobs$obs, ...): The metric "Accuracy"
## was not in the result set. ROC will be used instead.
# Show the stacked-ensemble summary (explicit print instead of auto-printing).
print(stack)
## A glm ensemble of 4 base models: xgbTree, glmnet, rf, gbm
## 
## Ensemble results:
## Generalized Linear Model 
## 
## 7255 samples
##    4 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2813, 2813, 2813, 2813, 2813, 2813, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7524129  0.6847667  0.7041084
# Class predictions from the GLM stack on the hold-out test set.
test.pred <- predict(stack, newdata = test.data)

# Confusion matrix with "YES" (strike) treated as the positive class;
# mode = "everything" adds precision/recall/F1 to the standard output.
caret::confusionMatrix(
  data = test.pred,
  reference = as.factor(test.data$STRIKE),
  positive = "YES",
  mode = "everything"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO  398 217
##        YES 175 414
##                                           
##                Accuracy : 0.6744          
##                  95% CI : (0.6471, 0.7008)
##     No Information Rate : 0.5241          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.3495          
##                                           
##  Mcnemar's Test P-Value : 0.03838         
##                                           
##             Sensitivity : 0.6561          
##             Specificity : 0.6946          
##          Pos Pred Value : 0.7029          
##          Neg Pred Value : 0.6472          
##               Precision : 0.7029          
##                  Recall : 0.6561          
##                      F1 : 0.6787          
##              Prevalence : 0.5241          
##          Detection Rate : 0.3439          
##    Detection Prevalence : 0.4892          
##       Balanced Accuracy : 0.6753          
##                                           
##        'Positive' Class : YES             
## 
# Class predictions from the GLM stack on the validation set.
valid.pred <- predict(stack, newdata = valid.cl.data)

# Confusion matrix with "YES" (strike) treated as the positive class;
# mode = "everything" adds precision/recall/F1 to the standard output.
caret::confusionMatrix(
  data = valid.pred,
  reference = as.factor(valid.cl.data$STRIKE),
  positive = "YES",
  mode = "everything"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  NO YES
##        NO   87  31
##        YES  69 116
##                                           
##                Accuracy : 0.67            
##                  95% CI : (0.6139, 0.7227)
##     No Information Rate : 0.5149          
##     P-Value [Acc > NIR] : 3.264e-08       
##                                           
##                   Kappa : 0.3442          
##                                           
##  Mcnemar's Test P-Value : 0.0002156       
##                                           
##             Sensitivity : 0.7891          
##             Specificity : 0.5577          
##          Pos Pred Value : 0.6270          
##          Neg Pred Value : 0.7373          
##               Precision : 0.6270          
##                  Recall : 0.7891          
##                      F1 : 0.6988          
##              Prevalence : 0.4851          
##          Detection Rate : 0.3828          
##    Detection Prevalence : 0.6106          
##       Balanced Accuracy : 0.6734          
##                                           
##        'Positive' Class : YES             
## 
# # all others may have just failed and are not listed here
# models.cla <- c("knn", "AdaBoost.M1", "rf", 'glmnet', 'xgboost')
# 
# # register parallel front-end
# cl.cla <- makeCluster(detectCores())
# registerDoParallel(cl.cla)
# 
# # this setup actually calls the caret::train function, in order to provide
# # minimal error handling this type of construct is needed.
# trainCall <- function(i)
# {
#   cat("----------------------------------------------------",
#       "\n")
#   
#   set.seed(123)
#   cat(i, " <- loaded\n")
#   
#   t2 <-
#     train(
#       train.data[, -11],
#       train.data[, c('STRIKE')],
#       method = i,
#       trControl = trainControl(method = "boot632",
#                                number = 5)
#     )
# }
# 
# # use lapply/loop to run everything, required for try/catch error function to work
# t2 <- lapply(models.cla, trainCall)
# 
# #remove NULL values, we only allow successful methods; provenance is deleted.
# t2 <- t2[!sapply(t2, is.null)]
# 
# # this setup extracts the results with minimal error handling 
# # TrainKappa can be sometimes zero, but Accuracy SD can be still available
# printCall <- function(i)
# {
#   return(tryCatch({
#     cat(sprintf("%-22s", (models.cla[i])))
#     cat(round(getTrainPerf(t2[[i]])$TrainAccuracy, 4), "\t")
#     cat(round(getTrainPerf(t2[[i]])$TrainKappa, 4), "\t")
#     cat(t2[[i]]$times$everything[3], "\n")
#   },
#   error = function(e)
#     NULL))
# }
#   
# r2 <- lapply(1:length(t2), printCall)
# 
# # stop cluster and register sequential front end
# stopCluster(cl.cla)
# registerDoSEQ()
# 
# 
# # preallocate data types
# i = 1; MAX = length(t2);
# x1 <- character() # Name
# x2 <- numeric()   # R2
# x3 <- numeric()   # RMSE
# x4 <- numeric()   # time [s]
# x5 <- character() # long model name
#  
# # fill data and check indexes and NA with loop/lapply
# for (i in 1:length(t2)) {
#   x1[i] <- t2[[i]]$method
#   x2[i] <-
#     as.numeric(round(getTrainPerf(t2[[i]])$TrainAccuracy, 4))
#   x3[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainKappa, 4))
#   x4[i] <- as.numeric(t2[[i]]$times$everything[3])
#   x5[i] <- t2[[i]]$modelInfo$label
# }
#   
# # coerce to data frame
# df1 <- data.frame(x1, x2, x3, x4, x5, stringsAsFactors = FALSE)
# 
# # print all results to R-GUI
# df1
# 
# # plot models, just as example
# # ggplot(t2[[1]])
# # ggplot(t2[[1]])
# 
# # call web output with correct column names
# datatable(
#   df1,
#   options = list(
#     columnDefs = list(list(
#       className = 'dt-left', targets = c(0, 1, 2, 3, 4, 5)
#     )),
#     pageLength = MAX,
#     order = list(list(2, 'desc'))
#   ),
#   colnames = c('Num', 'Name', 'Accuracy', 'Kappa', 'time [s]', 'Model name'),
#   caption = paste('Classification results from caret models', Sys.time()),
#   class = 'cell-border stripe'
# )  %>%
#   formatRound('x2', 3) %>%
#   formatRound('x3', 3) %>%
#   formatRound('x4', 3) %>%
#   formatStyle(
#     2,
#     background = styleColorBar(x2, 'steelblue'),
#     backgroundSize = '100% 90%',
#     backgroundRepeat = 'no-repeat',
#     backgroundPosition = 'center'
#   )
# 
# # print confusion matrix example
# caret::confusionMatrix(t2[[1]])
# # XGBoost ####
# 
# modelXGB_sample <- xgboost(
#   data = as.matrix(train.data[, -c(11, 9,10)]),
#   label = as.matrix(train.data[,11]),
#   nrounds = 50,
#   # optimal is 97
#   max_depth = 50,
#   # maximum depth of tree
#   eta = 0.3,
#   # step size shrinkage, learning rate
#   nthread = 4,
#   # number of threads to be used. 16 cores available
#   "gamma" = 0,
#   # minimum loss reduction, controls regularisation
#   objective = "binary:logistic",
#   min_child_weight = 1,
#   # minimum number of instances required in a child node
#   subsample = 1,
#   # controls number of samples supplied to a tree
#   colsample_bytree = 1,
#   # controls number of features supplied to a tree
#   save_period = NULL
# ) # controls number of features supplied to a tree
# 
# test.pred <- predict(modelXGB_sample, newdata = as.matrix(test.data[, -c(11, 9,10)]))
# 
# caret::confusionMatrix(
#   reference = as.matrix(as.factor(test.data$STRIKE)),
#   data = test.pred,
#   mode = 'everything',
#   positive = 'YES'
# )
# 
# valid.pred <- predict(modelXGB_sample, newdata = valid.cl.data)
# 
# caret::confusionMatrix(
#   reference = as.factor(valid.cl.data$STRIKE),
#   data = valid.pred,
#   mode = 'everything',
#   positive = 'YES'
# )


# prob_predXGB_sample <- predict(modelXGB_sample, newdata = as.matrix(test.data[,-c(11,9,10)])) # Predict the Test set results (probabilities)
# predictXGB_sample = ifelse(prob_predXGB_sample > 0.5, 1, 0) # convert probabilities to binary
# 
# cmXGB_sample <- table(predictXGB_sample>0.7, test.data$STRIKE)
# cmXGB_sample # Confusion matrix
# errorXGB_sample <- 100*(1-sum(diag(cmXGB_sample))/sum(cmXGB_sample))
# errorXGB_sample # error rate
# accuracyXGB_sample <- 100 - errorXGB_sample
# accuracyXGB_sample # accuracy rate
# precisionXGB_sample <- 100*cmXGB_sample[2,2]/sum(cmXGB_sample[2,1],cmXGB_sample[2,2]) 
# precisionXGB_sample # precision
# recallXGB_sample <- 100*cmXGB_sample[2,2]/sum(cmXGB_sample[1,2],cmXGB_sample[2,2]) 
# recallXGB_sample # recall
# FscoreXGB_sample <- 2*precisionXGB_sample*recallXGB_sample/(precisionXGB_sample+recallXGB_sample) 
# FscoreXGB_sample # F-score


# xgb.pred <-
#   prediction(prob_predXGB_sample, test.data) 

# xgb.perf <-
#   performance(xgb.pred, "tpr", "fpr")
# plot(
#   xgb.perf,
#   avg = "threshold",
#   colorize = TRUE,
#   lwd = 1,
#   main = "ROC Curve w/ Thresholds",
#   print.cutoffs.at = seq(0, 1, by = 0.05),
#   text.adj = c(-0.5, 0.5),
#   text.cex = 0.1
# )
# grid(col = "lightgray")
# axis(1, at = seq(0, 1, by = 0.1))
# axis(2, at = seq(0, 1, by = 0.1))
# abline(v = c(0.1, 0.3, 0.5, 0.7, 0.9),
#        col = "lightgray",
#        lty = "dotted") abline(h = c(0.1, 0.3, 0.5, 0.7, 0.9),
#                               col = "lightgray",
#                               lty = "dotted") lines(
#                                 x = c(0, 1),
#                                 y = c(0, 1),
#                                 col = "black",
#                                 lty = "dotted"
#                               )
# Work on a copy of the cleaned classification data for the H2O workflow,
# leaving class.data untouched for the caret models above.
h2o.data <- class.data

# # one-hot-encoding categorical features
# ohe_feats = c('MONTH', 'SEASON')
# 
# # Create dummies
# dummies <- dummyVars(~ MONTH + SEASON, data = h2o.data)
# 
# df.dummies <- as.data.frame(predict(dummies, newdata = h2o.data))

# # Merge Dummies to data frame
# h2o.data <-
#   cbind(h2o.data[, -c(which(colnames(h2o.data) %in% ohe_feats))], df.dummies)

# h2o.data <-
#   subset(h2o.data, select = -c(YEAR.2013, YEAR.2019))


# Reproducible 75/25 train/test split for the H2O models -------------------
set.seed(100)

# H2O's binomial learners require a factor (enum) response.
h2o.data$STRIKE <- as.factor(h2o.data$STRIKE)

# Stratified sampling on the response keeps class balance in both splits.
trainRowNumbers.cl <- createDataPartition(
  h2o.data$STRIKE,
  p = 0.75,
  list = FALSE
)

train.data <- h2o.data[trainRowNumbers.cl, ]
test.data  <- h2o.data[-trainRowNumbers.cl, ]

# Upload the training split to the H2O cluster (replaces the local
# data.frame with an H2OFrame handle).
train.data <- as.h2o(train.data)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Upload the test split to the H2O cluster (replaces the local
# data.frame with an H2OFrame handle).
test.data <- as.h2o(test.data)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Identify predictors and response
y <- "STRIKE"
# Every remaining column in h2o.data is used as a predictor.
x <- setdiff(names(h2o.data), c("STRIKE"))

# For binary classification, response should be a factor
# (on an H2OFrame this converts the column to a categorical/enum type;
# redundant with the earlier as.factor on h2o.data but harmless).
train.data[,y] <- as.factor(train.data[,y])
test.data[,y] <- as.factor(test.data[,y])

# Number of CV folds (to generate level-one data for stacking)
nfolds <- 5
# 2. Generate a random grid of models and stack them together

# Hyperparameter space for the GBM random grid search --------------------
hyper_params <- list(
  ntrees          = seq(10, 1000, by = 1),
  learn_rate      = seq(0.0001, 0.2, by = 0.0001),
  max_depth       = seq(1, 20, by = 1),
  sample_rate     = seq(0.5, 1.0, by = 0.0001),
  col_sample_rate = seq(0.2, 1.0, by = 0.0001)
)

# Draw at most 10 random models from the space above.
search_criteria <- list(
  strategy   = "RandomDiscrete",
  max_models = 10
)

# Unique suffix for the H2O grid ids.
# Fix: the original used only the seconds field ("%S"), so reruns within
# the same minute produce the same id and h2o.grid() appends to a stale
# grid; a full timestamp avoids that. Also dropped the redundant
# as.character() — format() already returns character.
grid.id <- format(Sys.time(), "%Y%m%d%H%M%S")


# Train & cross-validate a distributed random forest. No hyper_params are
# supplied, so this "grid" holds a single 2500-tree model. Modulo fold
# assignment keeps CV fold membership aligned across base models, which is
# required for the stacked ensemble's level-one data.
rf_grid <- h2o.grid(
  algorithm = "drf",
  grid_id = paste0("grid_binomial_rf_", grid.id),
  x = x,
  y = y,
  training_frame = train.data,
  ntrees = 2500,
  nfolds = nfolds,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 100
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |==========================================================            |  84%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |=================================================================     |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================|  99%
  |                                                                            
  |======================================================================| 100%
# Random grid search over GBM hyperparameters (at most 10 models, per
# search_criteria). Modulo fold assignment plus saved CV predictions
# keep the level-one data compatible with the other base models.
gbm_grid <- h2o.grid(
  algorithm = "gbm",
  grid_id = paste0("grid_binomial_gbm_", grid.id),
  x = x,
  y = y,
  training_frame = train.data,
  nfolds = nfolds,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  hyper_params = hyper_params,
  search_criteria = search_criteria,
  seed = 100
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |======================================================================| 100%
# Grid search over XGBoost hyper-parameters, using the same CV setup
# (Modulo folds, kept CV predictions) as the GBM grid so both can feed
# the stacked ensemble.
xgb_grid <- h2o.grid(
  algorithm = "xgboost",
  grid_id = paste0("grid_binomial_xgb_", grid.id),
  x = x,
  y = y,
  training_frame = train.data,
  nfolds = nfolds,
  fold_assignment = "Modulo",
  keep_cross_validation_predictions = TRUE,
  seed = 100,
  hyper_params = hyper_params,
  search_criteria = search_criteria
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=====================================                                 |  54%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  91%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |======================================================================| 100%
# Collect base learners for stacking: all model ids from BOTH the GBM
# grid and the XGBoost grid (each was trained with identical Modulo CV
# folds and kept CV predictions, as stacking requires).
base.models <- append(gbm_grid@model_ids,
                      xgb_grid@model_ids)

# Train a stacked ensemble on the combined GBM + XGBoost base models
# (the original comment said "GBM grid" only, but base.models includes
# both grids).
ensemble <- h2o.stackedEnsemble(x = x,
                                y = y,
                                model_id = paste0("ensemble_gbm_grid_", grid.id, "_1"),
                                training_frame = train.data,
                                base_models = base.models)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Evaluate ensemble performance on the held-out test set
perf <- h2o.performance(ensemble, newdata = test.data)

# Compare the ensemble to each base learner on the test set.
# Helper: test-set AUC for a single model id.
.getauc <-
  function(mm)
    h2o.auc(h2o.performance(h2o.getModel(mm), newdata = test.data))

# vapply (not sapply) guarantees a plain numeric vector regardless of
# input shape, so max() below is always well-defined.
baselearner_aucs <- vapply(base.models, .getauc, numeric(1))
baselearner_best_auc_test <- max(baselearner_aucs)
ensemble_auc_test <- h2o.auc(perf)
print(sprintf("Best Base-learner Test AUC:  %s", baselearner_best_auc_test))
## [1] "Best Base-learner Test AUC:  0.736546844424646"
print(sprintf("Ensemble Test AUC:  %s", ensemble_auc_test))
## [1] "Ensemble Test AUC:  0.738965031738709"
# Generate predictions on the test set (if necessary)
pred <- h2o.predict(ensemble, newdata = test.data)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Rank the GBM grid's models by cross-validated AUC (best model first)
get_gbm_grid <- h2o.getGrid(grid_id = gbm_grid@grid_id, sort_by = "AUC", decreasing = TRUE)
get_gbm_grid
## H2O Grid Details
## ================
## 
## Grid ID: grid_binomial_gbm_14 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  ntrees 
##   -  sample_rate 
## Number of models: 10 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing AUC
##    col_sample_rate learn_rate max_depth ntrees sample_rate
## 1           0.9046     0.0228        10     28      0.6884
## 2           0.9203     0.0021        12     76      0.6037
## 3           0.9257      0.007        15    252      0.6716
## 4           0.4466     0.0379         4    632      0.7663
## 5           0.3739     0.0261        10    427      0.6678
## 6           0.3326     0.0291        19    277      0.8742
## 7           0.4807     0.0334         7    866      0.9561
## 8           0.3513     0.0821        20    486      0.9026
## 9           0.6821     0.0334        14    728      0.6761
## 10          0.8212     0.1342         8    441      0.9764
##                        model_ids                auc
## 1   grid_binomial_gbm_14_model_5 0.7181182460928357
## 2   grid_binomial_gbm_14_model_2  0.715857633046727
## 3   grid_binomial_gbm_14_model_4 0.7130017267038504
## 4   grid_binomial_gbm_14_model_7 0.7113849843006849
## 5   grid_binomial_gbm_14_model_3 0.7042723775961279
## 6   grid_binomial_gbm_14_model_9 0.7028150572108655
## 7   grid_binomial_gbm_14_model_6 0.6933411498699452
## 8   grid_binomial_gbm_14_model_8 0.6854323163003493
## 9   grid_binomial_gbm_14_model_1 0.6843664851640369
## 10 grid_binomial_gbm_14_model_10 0.6753911580397716
# Model id of the top-ranked (highest CV AUC) GBM model
gbm_grid_top_model <- get_gbm_grid@summary_table[1, "model_ids"]
gbm_grid_top_model
## [1] "grid_binomial_gbm_14_model_5"
# Rank the XGBoost grid's models by cross-validated AUC (best model first)
get_xgb_grid <- h2o.getGrid(grid_id = xgb_grid@grid_id, sort_by = "AUC", decreasing = TRUE)
get_xgb_grid
## H2O Grid Details
## ================
## 
## Grid ID: grid_binomial_xgb_14 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  ntrees 
##   -  sample_rate 
## Number of models: 10 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing AUC
##    col_sample_rate learn_rate max_depth ntrees sample_rate
## 1           0.6801     0.0616         6     12      0.5436
## 2           0.7238       0.01         9    227      0.6867
## 3           0.9543     0.0446        16    330      0.6907
## 4           0.5262     0.0409        11    793      0.7308
## 5           0.7688     0.0963        15    249      0.6112
## 6           0.5476     0.0998        13    430      0.9742
## 7           0.3023     0.1302        12    723      0.5747
## 8           0.4026     0.0823        13    939      0.9523
## 9           0.6441     0.1299         3    856       0.768
## 10          0.3266     0.1656        13    922      0.6382
##                        model_ids                auc
## 1   grid_binomial_xgb_14_model_4 0.7269703634027106
## 2  grid_binomial_xgb_14_model_10 0.7204998608921451
## 3   grid_binomial_xgb_14_model_1 0.6984243275349646
## 4   grid_binomial_xgb_14_model_5 0.6953767614808142
## 5   grid_binomial_xgb_14_model_2 0.6926791731252456
## 6   grid_binomial_xgb_14_model_6 0.6918023520267793
## 7   grid_binomial_xgb_14_model_8 0.6901940002561351
## 8   grid_binomial_xgb_14_model_7 0.6883440865913276
## 9   grid_binomial_xgb_14_model_3  0.686505213232469
## 10  grid_binomial_xgb_14_model_9 0.6820195369254073
# Model id of the top-ranked (highest CV AUC) XGBoost model
xgb_grid_top_model <- get_xgb_grid@summary_table[1, "model_ids"]
xgb_grid_top_model
## [1] "grid_binomial_xgb_14_model_4"
# Rank the Random Forest grid's models by cross-validated AUC
# (original comment said "XGBOOST" — a copy/paste slip; this sorts rf_grid)
get_rf_grid <- h2o.getGrid(grid_id = rf_grid@grid_id, sort_by = "AUC", decreasing = TRUE)
get_rf_grid
## H2O Grid Details
## ================
## 
## Grid ID: grid_binomial_rf_14 
## Used hyper parameters: 
## Number of models: 1 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing AUC
##                     model_ids                auc
## 1 grid_binomial_rf_14_model_1 0.7211307039740685
# Model id of the top-ranked (highest CV AUC) Random Forest model
rf_grid_top_model <- get_rf_grid@summary_table[1, "model_ids"]
rf_grid_top_model
## [1] "grid_binomial_rf_14_model_1"
# Use AutoML to find a list of candidate models (i.e., leaderboard).
# NOTE(review): stopping_tolerance = 0 is far below H2O's recommended
# default for this data (the run emits a warning to that effect), so
# individual models may be slow to converge — confirm this is intended.
auto_ml <- h2o.automl(
  x = x,
  y = y,
  training_frame = train.data,
  nfolds = 5,
  max_runtime_secs = 60 * 120,  # cap the whole AutoML run at 2 hours
  max_models = 10,
  keep_cross_validation_predictions = FALSE,
  sort_metric = "auc",
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "auc",
  stopping_tolerance = 0
)
## 
  |                                                                            
  |                                                                      |   0%
## 23:17:57.268: Stopping tolerance set by the user is < 70% of the recommended default of 0.018217988943396556, so models may take a long time to converge or may not converge at all.
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |======================================================================| 100%
# Inspect the AutoML leaderboard, showing up to the top 25 models ranked
# by AUC. The single best model is also available via auto_ml@leader.
leaderboard <- as.data.frame(auto_ml@leaderboard)
leaderboard %>%
  dplyr::select(model_id, auc) %>%
  dplyr::slice(1:25)
##                                               model_id       auc
## 1  StackedEnsemble_BestOfFamily_AutoML_20200226_231757 0.7370301
## 2     StackedEnsemble_AllModels_AutoML_20200226_231757 0.7367382
## 3                     XGBoost_3_AutoML_20200226_231757 0.7325164
## 4                         GLM_1_AutoML_20200226_231757 0.7324252
## 5                     XGBoost_2_AutoML_20200226_231757 0.7291813
## 6                     XGBoost_1_AutoML_20200226_231757 0.7254258
## 7                         DRF_1_AutoML_20200226_231757 0.7101999
## 8                         GBM_5_AutoML_20200226_231757 0.6974415
## 9                         GBM_1_AutoML_20200226_231757 0.6872763
## 10                        GBM_2_AutoML_20200226_231757 0.6792325
## 11                        GBM_4_AutoML_20200226_231757 0.6751355
## 12                        GBM_3_AutoML_20200226_231757 0.6739104